#include <assert.h>
#include <stdlib.h>

// CUDA runtime
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include "helper.h"

// CUDA and CUBLAS functions

// Single-precision matrix multiply C = A * B on the GPU using cublasGemmEx.
//
// Parameters:
//   handle - cuBLAS handle the GEMM is issued on.
//   m, n, k - problem sizes: A is m x k, B is k x n, C is m x n.
//   d_A, d_B - device pointers to the input matrices.
//   d_C - device pointer to the output matrix (overwritten, since beta == 0).
//
// NOTE(review): the operand order below implies the matrices are stored
// row-major and densely packed (no row padding) -- confirm against callers.
//
// cuBLAS interprets memory as column-major, so a row-major X looks like X^T
// to it. Instead of transposing, the call computes C^T = B^T * A^T, which in
// memory is exactly the row-major C:
//   B^T is n x k with leading dimension n,
//   A^T is k x m with leading dimension k,
//   C^T is n x m with leading dimension n.
//
// The status returned by cuBLAS is passed to checkGPUErrors (declared outside
// this file).
void SGEMM_GPU(cublasHandle_t handle, int m, int n, int k, float *d_A, float *d_B, float *d_C) {

  // alpha = 1, beta = 0: plain product, the previous contents of C are ignored.
  const float alpha = 1.0f;
  const float beta = 0.0f;

  // Leading dimensions of the column-major views described above. The view of
  // d_A has k rows, so its leading dimension is k (not m); using m is only
  // correct for m == k and otherwise yields an invalid-value error or reads
  // d_A with the wrong stride.
  const int ld_B = n;
  const int ld_A = k;
  const int ld_C = n;

  checkGPUErrors(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                        n, m, k,
                        &alpha,
                        d_B, CUDA_R_32F, ld_B,
                        d_A, CUDA_R_32F, ld_A,
                        &beta,
                        d_C, CUDA_R_32F, ld_C,
                        CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
